package data_deepprocessing.algorithm.ssvm.ssvm_util;

import java.io.BufferedWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.deeplearning4j.models.word2vec.Word2Vec;

import data_deepprocessing.algorithm.word_embedding.service.YYH_Word2VectorService;
import data_deepprocessing.prepareData.beans.YYH_XianBingShiBean;
import data_deepprocessing.prepareData.db.SymptomDB;
import data_deepprocessing.prepareData.db.YYH_XianBingShiDB;
import data_deepprocessing.util.FilePathUtil;
import data_deepprocessing.util.FileUtil;

/**
 * Generates the SSVM-format data files from the segmented XianBingShi records.
 * Throughout this file "XBS" always stands for "XianBingShi".
 *
 * <p>Experiment pipeline as outlined by the original author:
 * <ol>
 *   <li>Obtain the {@code List<YYH_XianBingShiBean>} of records.</li>
 *   <li>Obtain the word-segmentation result for each record.</li>
 *   <li>Obtain the vector and model files through word embedding.</li>
 *   <li>Combine the record list with the vector/model files to generate the
 *       files SSVM consumes.</li>
 *   <li>Build the training and test data sets.</li>
 *   <li>Invoke the algorithm and start the experiment.</li>
 * </ol>
 * Author's note: with this in place the experiment-design and method sections
 * of the write-up should be straightforward to organize.
 *
 * @author YUHU YUAN
 * @version 1.0 (created 2017-03-31 22:27:31)
 */

public class SsvmService {

	/** Records whose segmented content is shorter than this many characters are skipped. */
	private static final int MIN_SEGMENTED_LENGTH = 20;

	private SymptomDB symptomDB;
	private YYH_XianBingShiDB yyh_XianBingShiDB;

	/**
	 * Loads every XianBingShi record, ordered by id.
	 *
	 * <p>Author's note: the plan was to start with a subset and try ten-fold
	 * cross-validation later; for now everything is used for training.
	 *
	 * @return whatever {@code selectAllXianBingShiOrderById()} returns
	 */
	private List<YYH_XianBingShiBean> doGetSegmentedXianBingShi(){
		return yyh_XianBingShiDB.selectAllXianBingShiOrderById();
	}

	/**
	 * Loads the Word2Vec model through {@link YYH_Word2VectorService}.
	 *
	 * @return the loaded model
	 * @throws IOException if the model cannot be loaded; the failure is
	 *         propagated instead of being printed and turned into a
	 *         {@code null} return that would fail later with an NPE
	 */
	private Word2Vec doGetWord2Vec() throws IOException{
		return YYH_Word2VectorService.getWord2Vec();
	}

	/**
	 * Loads the distinct symptom strings used to label each token.
	 *
	 * @return whatever {@code selectAllDistinctSymptomContent()} returns
	 */
	private List<String> doGetSymptom_Dictionary(){
		return symptomDB.selectAllDistinctSymptomContent();
	}

	/**
	 * Tells whether a token carries no visible characters.
	 *
	 * @param word token to inspect, may be {@code null}
	 * @return {@code true} when the token is {@code null}, empty or whitespace only
	 */
	private static boolean isBlankToken(String word){
		return word == null || word.trim().isEmpty();
	}

	/**
	 * Writes one line per token of every sufficiently long record to two files.
	 *
	 * <p>File at {@code FilePathUtil.SSVM_FORMAT_FILE_PATH}:
	 * {@code <tag> qid:<id> 1:<v1> 2:<v2> ... #<word>}. The feature list is
	 * empty when the model returns no vector for the word.<br>
	 * File at {@code FilePathUtil.SSVM_FORMAT_FILE_JUDGE_PATH}:
	 * {@code <tag> qid:<id> #<word>}.<br>
	 * {@code tag} is {@code 1} when the word is in the symptom dictionary and
	 * {@code 2} otherwise.
	 *
	 * @throws IOException if the Word2Vec model cannot be loaded or a file
	 *         cannot be written
	 */
	public void doGenerateSsvmDataSet() throws IOException{
		// Load every input before touching the output files, so a loading
		// failure surfaces before any writer has been opened.
		List<YYH_XianBingShiBean> xianbingshiBeanList = doGetSegmentedXianBingShi();
		// A hash set gives constant-time membership tests; the lookup runs once per token.
		Set<String> symptomDictionary = new HashSet<String>(doGetSymptom_Dictionary());
		Word2Vec word2Vec = doGetWord2Vec();

		// try-with-resources closes (and thereby flushes) both writers even
		// when an exception interrupts the loop.
		try (BufferedWriter writer = FileUtil.getWriter(FilePathUtil.SSVM_FORMAT_FILE_PATH);
				BufferedWriter writer2judge = FileUtil.getWriter(FilePathUtil.SSVM_FORMAT_FILE_JUDGE_PATH)) {
			StringBuilder constructTerm = new StringBuilder();
			StringBuilder constructTerm2judge = new StringBuilder();
			for(YYH_XianBingShiBean bean : xianbingshiBeanList){
				String segmentedXBS = bean.getContent_segmented();
				if(segmentedXBS == null || segmentedXBS.length() < MIN_SEGMENTED_LENGTH){
					continue;
				}
				String[] wordOrPhrases = segmentedXBS.trim().split("[\\s]+");
				for(String word : wordOrPhrases){
					// A whitespace-only record splits into a single empty token.
					if(isBlankToken(word)){
						continue;
					}
					constructTerm.setLength(0);
					constructTerm2judge.setLength(0);

					double[] vector = word2Vec.getWordVector(word);
					String tag = symptomDictionary.contains(word) ? "1" : "2";

					constructTerm.append(tag).append(" ");
					constructTerm.append("qid:").append(bean.getId()).append(" ");
					constructTerm2judge.append(tag).append(" ");
					constructTerm2judge.append("qid:").append(bean.getId()).append(" ");

					if(vector != null){
						// Feature indices are 1-based in the output.
						for(int i = 0; i < vector.length; ++i){
							constructTerm.append(i + 1).append(":").append(vector[i]).append(" ");
						}
					}
					constructTerm.append("#").append(word);
					constructTerm2judge.append("#").append(word);

					writer.write(constructTerm.toString());
					writer.newLine();
					writer2judge.write(constructTerm2judge.toString());
					writer2judge.newLine();
				}
			}
		}
	}

	// TODO (original author): the remaining step is to invoke the SSVM
	// algorithm itself; the CRFs experiment was scheduled to be finished first.

	/** @return the symptom data-access object currently in use */
	public SymptomDB getSymptomDB() {
		return symptomDB;
	}

	/** @param symptomDB the symptom data-access object to use */
	public void setSymptomDB(SymptomDB symptomDB) {
		this.symptomDB = symptomDB;
	}

	/** @return the XianBingShi data-access object currently in use */
	public YYH_XianBingShiDB getYyh_XianBingShiDB() {
		return yyh_XianBingShiDB;
	}

	/** @param yyh_XianBingShiDB the XianBingShi data-access object to use */
	public void setYyh_XianBingShiDB(YYH_XianBingShiDB yyh_XianBingShiDB) {
		this.yyh_XianBingShiDB = yyh_XianBingShiDB;
	}

}















